Carga de paquetes

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.0
## ✓ tidyr   1.1.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
library(cowplot)
## 
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
##   default ggplot2 theme anymore. To recover the previous
##   behavior, execute:
##   theme_set(theme_cowplot())
## ********************************************************
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
## 
##     theme_map

Ejericio disponible en mi repo: https://solariabiodata.github.io/W3bin-R/

Importación de datos

Descargar datos: datos

eukaryotes <- read_csv("data/eukaryotes.csv", col_names = TRUE)
## Parsed with column specification:
## cols(
##   `#Organism Name` = col_character(),
##   `Organism Groups` = col_character(),
##   Strain = col_character(),
##   BioSample = col_character(),
##   BioProject = col_character(),
##   Assembly = col_character(),
##   Level = col_character(),
##   `Size(Mb)` = col_double(),
##   `GC%` = col_double(),
##   Replicons = col_character(),
##   WGS = col_character(),
##   Scaffolds = col_double(),
##   CDS = col_double(),
##   `Release Date` = col_datetime(format = ""),
##   `GenBank FTP` = col_character(),
##   `RefSeq FTP` = col_character()
## )

Exploración y limpieza de datos

eukaryotes %>% glimpse()
## Rows: 5,131
## Columns: 16
## $ `#Organism Name`  <chr> "Emiliania huxleyi CCMP1516", "Arabidopsis thaliana…
## $ `Organism Groups` <chr> "Eukaryota;Protists;Other Protists", "Eukaryota;Pla…
## $ Strain            <chr> "CCMP1516", NA, "A17", NA, NA, NA, NA, NA, NA, "B80…
## $ BioSample         <chr> "SAMN02744062", "SAMN03081427", "SAMN02299339", "SA…
## $ BioProject        <chr> "PRJNA77753", "PRJNA10719", "PRJNA10791", "PRJNA198…
## $ Assembly          <chr> "GCA_000372725.1", "GCA_000001735.1", "GCA_00021949…
## $ Level             <chr> "Scaffold", "Chromosome", "Chromosome", "Chromosome…
## $ `Size(Mb)`        <dbl> 167.67600, 119.66800, 412.92400, 978.97200, 823.786…
## $ `GC%`             <dbl> 64.5000, 36.0528, 34.0470, 35.1208, 35.7097, 0.0000…
## $ Replicons         <chr> NA, "chromosome 1:NC_003070.9/CP002684.1; chromosom…
## $ WGS               <chr> "AHAL01", NA, "APNO01", "ACUP02", "AEKE02", "FJWB02…
## $ Scaffolds         <dbl> 7795, 7, 2187, 1191, 3224, 72295, 58, 279439, 598, …
## $ CDS               <dbl> 38554, 48350, 57661, 71525, 36010, 0, 41070, 0, 584…
## $ `Release Date`    <dttm> 2013-04-19, 2001-08-13, 2011-08-12, 2010-01-05, 20…
## $ `GenBank FTP`     <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/372…
## $ `RefSeq FTP`      <chr> "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/372…

Selección de columnas de interés

eukaryotes %>% select(c("#Organism Name","Organism Groups","Strain","Size(Mb)","GC%","CDS")) -> eukaryotes_clean
  
eukaryotes_clean %>% head()
## # A tibble: 6 x 6
##   `#Organism Name`       `Organism Groups`        Strain  `Size(Mb)` `GC%`   CDS
##   <chr>                  <chr>                    <chr>        <dbl> <dbl> <dbl>
## 1 Emiliania huxleyi CCM… Eukaryota;Protists;Othe… CCMP15…       168.  64.5 38554
## 2 Arabidopsis thaliana   Eukaryota;Plants;Land P… <NA>          120.  36.1 48350
## 3 Medicago truncatula    Eukaryota;Plants;Land P… A17           413.  34.0 57661
## 4 Glycine max            Eukaryota;Plants;Land P… <NA>          979.  35.1 71525
## 5 Solanum lycopersicum   Eukaryota;Plants;Land P… <NA>          824.  35.7 36010
## 6 Hordeum vulgare        Eukaryota;Plants;Land P… <NA>         9789.   0       0

Renombrar columnas

eukaryotes_clean %>% rename(organism="#Organism Name",groups="Organism Groups",genome_size_mb="Size(Mb)",gc_percent="GC%") %>% rename_with(tolower) -> eukaryotes_clean

eukaryotes_clean %>% head()
## # A tibble: 6 x 6
##   organism           groups              strain  genome_size_mb gc_percent   cds
##   <chr>              <chr>               <chr>            <dbl>      <dbl> <dbl>
## 1 Emiliania huxleyi… Eukaryota;Protists… CCMP15…           168.       64.5 38554
## 2 Arabidopsis thali… Eukaryota;Plants;L… <NA>              120.       36.1 48350
## 3 Medicago truncatu… Eukaryota;Plants;L… A17               413.       34.0 57661
## 4 Glycine max        Eukaryota;Plants;L… <NA>              979.       35.1 71525
## 5 Solanum lycopersi… Eukaryota;Plants;L… <NA>              824.       35.7 36010
## 6 Hordeum vulgare    Eukaryota;Plants;L… <NA>             9789.        0       0

Separar columna “groups” en tres: “domain”, “kingdom”, “class”

eukaryotes_clean %>% separate(groups,c("domain","kingdom","class"),sep = ";") -> eukaryotes_clean

eukaryotes_clean %>% head()
## # A tibble: 6 x 8
##   organism       domain  kingdom  class   strain genome_size_mb gc_percent   cds
##   <chr>          <chr>   <chr>    <chr>   <chr>           <dbl>      <dbl> <dbl>
## 1 Emiliania hux… Eukary… Protists Other … CCMP1…           168.       64.5 38554
## 2 Arabidopsis t… Eukary… Plants   Land P… <NA>             120.       36.1 48350
## 3 Medicago trun… Eukary… Plants   Land P… A17              413.       34.0 57661
## 4 Glycine max    Eukary… Plants   Land P… <NA>             979.       35.1 71525
## 5 Solanum lycop… Eukary… Plants   Land P… <NA>             824.       35.7 36010
## 6 Hordeum vulga… Eukary… Plants   Land P… <NA>            9789.        0       0

Visualización de datos

¿Cuántos genomas hay por cada grupo?

#ctrl + shift +m 
eukaryotes_clean %>% count(kingdom) %>% arrange(desc(n))   
## # A tibble: 5 x 2
##   kingdom      n
##   <chr>    <int>
## 1 Fungi     2844
## 2 Animals   1250
## 3 Protists   539
## 4 Plants     478
## 5 Other       20

Explorar tamaños de genoma:

eukaryotes_clean %>% group_by(kingdom) %>% summarise(mean=mean(genome_size_mb),sd=sd(genome_size_mb),max=max(genome_size_mb),min=min(genome_size_mb))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 5 x 5
##   kingdom    mean     sd    max    min
##   <chr>     <dbl>  <dbl>  <dbl>  <dbl>
## 1 Animals  1132.  1434.  32394.  0.663
## 2 Fungi      29.9   20.2   216.  2.08 
## 3 Other     108.   129.    543. 12.1  
## 4 Plants   1023.  2738.  27603.  0.986
## 5 Protists   46.7   57.1   808.  1.45

Histograma contando el número de observaciones para una variable:

# geom_bar(stat = "count") vs geom_col(stat="identity")
# count(kingdom)
eukaryotes_clean  %>%  count(kingdom) %>% ggplot(., mapping = aes(x=reorder(kingdom,desc(n)),y=n,fill=kingdom, color=kingdom)) + geom_col(width = 0.001, alpha=0.7) + geom_point(size=4) + scale_fill_brewer(palette = "Set1") + scale_color_brewer(palette = "Set1") + geom_hline(yintercept = 250, color="red") + labs(x="Reino", y="Conteo", title = "Título") + coord_polar() + theme_cowplot() + theme(legend.text = element_text(size=7))

# 1. Agrupar y Reordenar
# 2. Rellenar
# 3. Modificar colores (fill, scale_fill_brewer(palette = "Set1"))
# 3. Añadir capas (p.ej. geom_hline)
# 4. Transparencia  
# 5. Modificar etiquetas de los ejes usando labs(), (símbolos, ej.: expression(paste("Etiqueta ",mu)))
# 6. Modificar sistema de coordenadas: (coord_flip, coord_polar)
# 6. Modificar tema: theme_*, + theme(legend.text = element_text(size=7), legend.key.size = unit(0.5,"cm")

Podemos hacer la misma gráfica a partir de dos variables (una categórica y otra con valores núméricos):

#eukaryotes_clean %>% count(kingdom) %>% ggplot(data = ., mapping = aes(x=reorder(kingdom,desc(n)),y=n,fill=kingdom)) + geom_col()
eukaryotes_clean  %>% ggplot(.,aes(x=cds,y=genome_size_mb,fill=kingdom)) + geom_point() + facet_grid(~kingdom)

#library(plotly)
#ggplotly(grafica)
# 1. Reordenar: reorder()
# 2. Boxplot: geom_boxplot()
# 3. Rellenar y cambiar escala de color: fill=, scale_fill_brewer()
# 4. Cambiar escala (log10): scale_y_log10()
# 5. Invertir coordenadas: coord_flip()
# 6. Modificar tema: theme_*
# 7. Graficar por grupo: facet_wrap()
# 8. Asignar a una variable
# p. Versión interactiva con Plotly: ggplotly

Hacer versión interactiva modificando las etiquetas:

#Haremos una versión utilizando geom_point en vez de geom_boxplot (para analizar los datos de cada organismo):
#Definir estructura de las etiquetas: 
eukaryotes_clean %>%  ggplot(data = ., mapping = aes(x=reorder(class,genome_size_mb),y=genome_size_mb, fill=kingdom, text=paste("Clase:", class,"\n Organismo", organism,"\nTamaño de Genoma:", genome_size_mb))) + 
  geom_point(alpha=0.7, color=NA) + #Añadir puntos, color=NA remueve el borde de cada uno
  scale_fill_brewer( palette = "Set1")  + 
  scale_y_log10() +
  coord_flip() + 
  theme_bw() + 
  facet_wrap(~kingdom) +
  labs(x="",y="Genome Size (Mb)", fill="") -> p 

ggplotly(p, tooltip = c("text")) %>% layout(hoverlabel=list(bgcolor="white"))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#ggplotly(p)

# Añadir etiquetas para cada organismo con información general: 
# text= paste("Class: ",class,"\nOrganism: ",organism,"\nGenome size (Mb):", genome_size_mb)
# Modificar parámetro tooltip = c("text")
# Modificar menú de gráfica Plotly %>% config(displayModeBar = F) 
# Modificar fondo de la etiqueta: %>% layout(hoverlabel=list(bgcolor="white"))